{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# INPUT FOLDER - FILES MATCHING FILE_MASK ARE READ FROM HERE\n",
    "RAW_DOCS_FOLDER   = \"/Users/simon.hughes/Documents/Dice Data/LuceneTalk/Docs\"\n",
    "# OUTPUT FOLDER FOR DOCUMENT PROCESSING - USED IN REMAINING STEPS\n",
    "DOCS_FOLDER       = \"/Users/simon.hughes/Documents/Dice Data/LuceneTalk/ProcessedDocs\"\n",
    "# REMOVE ALL FILES IN OUTPUT FOLDER FIRST (SO YOU CAN RE-RUN AT WILL)\n",
    "EMPTY_OUTPUT_FOLDER = True\n",
    "\n",
    "# REGEX (NOT A GLOB), SEARCHED CASE-INSENSITIVELY IN EACH FILE NAME\n",
    "FILE_MASK = r\".*\\.txt\"\n",
    "# WHEN TRUE, DOCUMENTS ARE RUN THROUGH parse_html BEFORE SENTENCE SPLITTING\n",
    "PARSE_HTML = True\n",
    "# DOCUMENTS SHORTER THAN THIS (BEFORE OR AFTER HTML PARSING) ARE SKIPPED\n",
    "FILE_SIZE_LIMIT_CHARS = 1000"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#Shared\n",
    "import re\n",
    "# Any run of whitespace; raw literal so the backslash reaches the regex engine untouched\n",
    "re_collapse_spaces = re.compile(r\"\\s+\")\n",
    "\n",
    "def collapse_spaces(s):\n",
    "    \"\"\"Replace every run of whitespace in s with one space and trim both ends.\"\"\"\n",
    "    return re_collapse_spaces.sub(\" \", s).strip()\n",
    "\n",
    "# Runs of these characters become one space. A hyphen is literal when it leads a\n",
    "# character class and a doubled backslash matches a single backslash, so both are\n",
    "# handled by the pattern itself rather than by separate str.replace calls.\n",
    "re1 = re.compile(r\"\"\"[-\\\\;:'\"*/),(|\\s]+\"\"\")\n",
    "def clean_str(s):\n",
    "    \"\"\"Return str(s) with punctuation and whitespace runs reduced to single spaces.\n",
    "\n",
    "    Every literal 's is replaced by a space first (wherever it occurs, not only\n",
    "    in possessives); then each run of hyphens, backslashes, the punctuation in\n",
    "    re1 and whitespace becomes one space, and the result is trimmed.\n",
    "    \"\"\"\n",
    "    s = str(s).replace(\"'s\", \" \")\n",
    "    s = re1.sub(\" \", s).strip()\n",
    "    return collapse_spaces(s)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import os\n",
    "import re\n",
    "import time\n",
    "\n",
    "from bs4 import BeautifulSoup, Comment\n",
    "from nltk.tokenize import sent_tokenize\n",
    "\n",
    "# Sentence-break marker: substituted for html tags in pre_process_text and used\n",
    "# as the separator when get_text joins the extracted text nodes\n",
    "REPL = \".\\n\"\n",
    "\n",
    "def strip_non_ascii(text):\n",
    "    \"\"\"Return text with every character whose ordinal is 128 or above removed.\"\"\"\n",
    "    ascii_only = [ch for ch in text if ord(ch) < 128]\n",
    "    return ''.join(ascii_only)\n",
    "\n",
    "# Turn common html tags into sentence breaks before the markup is stripped\n",
    "def pre_process_text(txt):\n",
    "    \"\"\"Rewrite list, line-break and paragraph tags in txt as REPL.\n",
    "\n",
    "    Also maps &nbsp; to a plain space, then repeatedly shrinks doubled periods\n",
    "    to a period plus space and doubled spaces to one space until none remain.\n",
    "    Returns the rewritten string.\n",
    "    \"\"\"\n",
    "    # Applied in this exact order: the joined \"</li><li>\" must be consumed\n",
    "    # before its two halves are replaced individually.\n",
    "    break_markers = (\n",
    "        \"</li><li>\", \"<li>\", \"</li>\",\n",
    "        \"<br>\", \"<br/>\", \"<br />\",\n",
    "        \"<p>\", \"<p/>\", \"<p />\", \"</p>\",\n",
    "        \". .\",\n",
    "    )\n",
    "    for marker in break_markers:\n",
    "        txt = txt.replace(marker, REPL)\n",
    "    txt = txt.replace(\"&nbsp;\", \" \")\n",
    "    while \"..\" in txt:\n",
    "        txt = txt.replace(\"..\", \". \")\n",
    "    while \"  \" in txt:\n",
    "        txt = txt.replace(\"  \", \" \")\n",
    "    return txt\n",
    "\n",
    "def visible(element):\n",
    "    \"\"\"Tell whether a BeautifulSoup text node should be kept as page text.\n",
    "\n",
    "    element : text node as returned by findAll(text=True)\n",
    "\n",
    "    Returns False for text whose parent is style, script, head, title or the\n",
    "    document object itself, and for HTML comments; True otherwise.\n",
    "    \"\"\"\n",
    "    if element.parent.name in ['style', 'script', '[document]', 'head', 'title']:\n",
    "        return False\n",
    "    # bs4 delivers comments as Comment nodes whose text does not include the\n",
    "    # comment delimiters, so they have to be recognised by type.\n",
    "    if isinstance(element, Comment):\n",
    "        return False\n",
    "    # Legacy check retained: drops text that itself starts with a literal comment.\n",
    "    if re.match('<!--.*-->', strip_non_ascii(element)):\n",
    "        return False\n",
    "    return True\n",
    "\n",
    "def get_text(html):\n",
    "    \"\"\"Extract the text of an html string that visible() accepts.\n",
    "\n",
    "    The accepted text nodes are returned as one string joined with REPL.\n",
    "    \"\"\"\n",
    "    # NOTE(review): no parser is named here, so bs4 picks one from whatever is\n",
    "    # installed - consider pinning it if output must match across machines.\n",
    "    soup = BeautifulSoup(html)\n",
    "    kept = [node for node in soup.findAll(text=True) if visible(node)]\n",
    "    return REPL.join(kept)\n",
    "\n",
    "def parse_html(html):\n",
    "    \"\"\"Run html through pre_process_text, then return the text get_text extracts.\"\"\"\n",
    "    return get_text(pre_process_text(html))\n",
    "\n",
    "def split_into_sentences(txt):\n",
    "    \"\"\"Split txt into cleaned sentences, discarding very short ones.\n",
    "\n",
    "    Non-ASCII characters are removed, the text is sentence-tokenized with\n",
    "    sent_tokenize, every sentence is passed through clean_str, and only\n",
    "    sentences longer than 5 characters (after stripping) are kept.\n",
    "    \"\"\"\n",
    "    def long_enough(sentence):\n",
    "        return len(sentence.strip()) > 5\n",
    "\n",
    "    ascii_txt = strip_non_ascii(txt)\n",
    "    cleaned = map(clean_str, sent_tokenize(ascii_txt))\n",
    "    return filter(long_enough, cleaned)\n",
    "\n",
    "def find_files(folder, regex, remove_empty = False):\n",
    "    \"\"\"\n",
    "    Find all files matching the [regex] pattern in [folder]\n",
    "\n",
    "    folder       :   string\n",
    "                         folder to search (not recursive)\n",
    "    regex        :   string (NOT regex object)\n",
    "                         pattern searched, ignoring case, in each file name\n",
    "    remove_empty :   bool\n",
    "                         when True, zero-byte files are left out\n",
    "\n",
    "    returns      :   sorted list of absolute paths (sub-folders never included)\n",
    "    \"\"\"\n",
    "    pattern = re.compile(regex, re.IGNORECASE)\n",
    "    matches = []\n",
    "    for name in os.listdir(folder):\n",
    "        if not pattern.search(name):\n",
    "            continue\n",
    "        full_path = os.path.abspath(os.path.join(folder, name))\n",
    "        # listdir also yields sub-folders; callers open() or os.remove() every\n",
    "        # result, and both fail on a directory.\n",
    "        if not os.path.isfile(full_path):\n",
    "            continue\n",
    "        if remove_empty and os.path.getsize(full_path) == 0:\n",
    "            continue\n",
    "        matches.append(full_path)\n",
    "    matches.sort()\n",
    "    return matches\n",
    "\n",
    "def delete_files(folder, regex):\n",
    "    \"\"\" Deletes files in [folder] that match [regex]\n",
    "        e.g. delete_files(\"C:/Dice Data/DelTest\", \".*[.]txt\")\n",
    "\n",
    "        folder      :   string\n",
    "                            folder to search (not recursive)\n",
    "        regex       :   string\n",
    "                            file pattern to match\n",
    "    \"\"\"\n",
    "    for full_path in find_files(folder, regex):\n",
    "        # os.remove cannot delete a directory; skip anything that is not a\n",
    "        # regular file instead of aborting the clean-up part-way through.\n",
    "        if os.path.isfile(full_path):\n",
    "            os.remove(full_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0\n",
      "1000\n",
      "2000\n",
      "5000\n",
      "6000\n",
      "7000\n",
      "8000\n",
      "9000\n",
      "10000\n",
      "11000\n",
      "12000\n",
      "13000\n",
      "14000\n",
      "15000\n",
      "16000\n",
      "20000\n",
      "21000\n",
      "22000\n",
      "23000\n",
      "25000\n",
      "26000\n",
      "27000\n",
      "28000\n",
      "29000\n",
      "30000\n",
      "31000\n",
      "32000\n",
      "33000\n",
      "34000\n",
      "35000\n",
      "36000\n",
      "37000\n",
      "38000\n",
      "41000\n",
      "42000\n",
      "45000\n",
      "46000\n",
      "47000\n",
      "48000\n",
      "49000\n",
      "51000\n",
      "52000\n",
      "53000\n",
      "54000\n",
      "55000\n",
      "56000\n",
      "57000\n",
      "58000\n",
      "60000\n",
      "61000\n",
      "62000\n",
      "63000\n",
      "64000\n",
      "65000\n",
      "66000\n",
      "67000\n",
      "68000\n",
      "69000\n",
      "70000\n",
      "71000\n",
      "72000\n",
      "73000\n",
      "74000\n",
      "76000\n",
      "77000\n",
      "Loading and processing documents took 539.035063028 seconds\n"
     ]
    }
   ],
   "source": [
    "import ntpath\n",
    "\n",
    "def get_file_name(path):\n",
    "    \"\"\"Return the last component of path.\n",
    "\n",
    "    ntpath splits on both forward slashes and backslashes. When path ends\n",
    "    with a separator, the component just before it is returned instead.\n",
    "    \"\"\"\n",
    "    head, tail = ntpath.split(path)\n",
    "    return tail or ntpath.basename(head)\n",
    "\n",
    "start = time.time()\n",
    "\n",
    "# Create the output folder on a first run so neither the clean-up nor the writes fail\n",
    "if not os.path.isdir(DOCS_FOLDER):\n",
    "    os.makedirs(DOCS_FOLDER)\n",
    "\n",
    "if EMPTY_OUTPUT_FOLDER:\n",
    "    if DOCS_FOLDER == RAW_DOCS_FOLDER:\n",
    "        print(\"ERROR - Can't empty output folder if the same as the input folder\")\n",
    "    else:\n",
    "        delete_files(DOCS_FOLDER,\".*\")\n",
    "\n",
    "files = find_files(RAW_DOCS_FOLDER, FILE_MASK, True)\n",
    "for i, fpath in enumerate(files):\n",
    "    # Report progress before any `continue`, so a skipped document cannot hide a milestone\n",
    "    if i % 1000 == 0:\n",
    "        print(i)\n",
    "\n",
    "    # Read, then release the input file before the slower processing starts\n",
    "    with open(fpath) as f:\n",
    "        contents = f.read()\n",
    "    if len(contents) < FILE_SIZE_LIMIT_CHARS:\n",
    "        continue\n",
    "    if PARSE_HTML:\n",
    "        contents = parse_html(contents)\n",
    "        # Re-check the length: the extracted text can be much shorter than the markup\n",
    "        if len(contents) < FILE_SIZE_LIMIT_CHARS:\n",
    "            continue\n",
    "\n",
    "    sents = split_into_sentences(contents)\n",
    "    doc = \"\\n\".join(sents)\n",
    "\n",
    "    # splitext removes only the final extension, so inputs such as \"a.b.txt\" and\n",
    "    # \"a.c.txt\" no longer both map to \"a_proc.txt\" and overwrite each other\n",
    "    base_name = os.path.splitext(get_file_name(fpath))[0]\n",
    "    fout_name = os.path.join(DOCS_FOLDER, base_name + \"_proc.txt\")\n",
    "    with open(fout_name, \"w\") as fout:\n",
    "        fout.write(doc)\n",
    "end = time.time()\n",
    "print(\"Loading and processing documents took %s seconds\" % str(end - start))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
